program ORIVCorrelation, rclass
   /*
   * This package implements the ORIV correlation estimator presented in
   *   'Experimenting with Measurement Error: Techniques with Applications to the Caltech Cohort Study'
   *   by Gillen, Snowberg, and Yariv
   *
   * The package was put together by Erik Snowberg and Adlai Newson. Comments
   * should be sent to Erik Snowberg, at snowberg@mail.ubc.ca
   *
   * Basic Usage:
   *    ORIVCorrelation, group1(x1 x2 ...) group2(y1 y2 ...) bsreps(5000)
   * 
   * Output: 
   *    r(corr): the ORIV estimated correlation 
   *    r(se):   the boostrapped standard error of the estimated correlation r(corr)
   *    r(asym_se):  the asymptotic standard error of the estimated correlation r(corr)
   */

   version 12  

   tempfile tempo // tempfile for original data
   quietly save `tempo',replace // restore the data back to user afterwards

   *** (0) Syntax handling, misc
   syntax [if] [pweight /], GROUP1(varlist numeric) GROUP2(varlist numeric) [equalvar BSREPS(integer 5000)] 

   if `bsreps' < 0 {
      di "Bootstrap reps must be a nonnegative integer - use option bsreps(0) to skip the bootstrap"
      error 1
   }

   ** Keep only the data that satisfies the [if] option
   if "`if'" != "" {
      quietly keep `if'
   }

   ** Generate weight macros
   if "`weight'" != "" {
      gen __weightvar = `exp'
      local pw_exp =  "[pweight=__weightvar]" // used for ivreg
      local aw_exp =  "[aweight=__weightvar]" // used for summarize,corr
   }
   else {
      local pw_exp = ""
      local aw_exp = ""
      quietly gen __weightvar = .
   }

   ** Get original sample size
   local N = _N
   ** Count, rename, and standardize the input variables (group 1)
   local input_n1 = 0
   foreach var in `group1' { 
      local ++input_n1
      keep if ~missing(`var')
      quietly rename `var' __input1_`input_n1'
      quietly summarize __input1_`input_n1' `aw_exp'
      quietly replace __input1_`input_n1' = (__input1_`input_n1'-r(mean))/r(sd)
   }

   ** Count, rename, and standardize the input variables (group 2)
   local input_n2 = 0
   foreach var in `group2' { 
      local ++input_n2
      keep if ~missing(`var')
      quietly rename `var' __input2_`input_n2'
      quietly summarize __input2_`input_n2' `aw_exp'
      quietly replace __input2_`input_n2' = (__input2_`input_n2'-r(mean))/r(sd)
   }

   quietly keep __input1_* __input2_*  __weightvar


   *** (1) Constructing the Long Dataset

   ** Define the LHS,RHS variables according the size of input groups
   if `input_n1'==1 & `input_n2'>1 {
      * In this case input group 1 is LHS variable (v1)
      rename __input1_* v1_*
      rename __input2_* v2_*
      local n1 = `input_n1'
      local n2 = `input_n2'
      local one_and_many "true"

   }
   if `input_n2'==1 & `input_n1'>1 {
      * In this case input group 2 is LHS variable (v1)
      rename __input1_* v2_*
      rename __input2_* v1_*
      local n1 = `input_n2'
      local n2 = `input_n1'
      local one_and_many "true"

   }

   if `input_n2'>1 & `input_n1'>1 {
      rename __input1_* v1_*
      rename __input2_* v2_*
      local n1 = `input_n1'
      local n2 = `input_n2'
      local many_and_many "true"

   }
   if `input_n2'==1 & `input_n1'==1 {
      rename __input1_1 v1_1
      rename __input2_1 v2_1
      local n1 = 1
      local n2 = 1
      local one_and_one = "true"

   }

   quietly generate id = _n
   quietly generate LHS = . 
   quietly generate RHS = . 

   ** Expand the data from length n to length n*ny*nx:
   quietly expand `n1' // duplicate each id by number of v1 variables n1, and populate LHS
   bysort id: generate v1copy = _n
   forvalues i=1/`n1' {
      quietly bysort id: replace LHS = v1_`i' if _n==`i'
   }

   quietly expand `n2' // duplicate each id*n1 by number of v2 variables n2, and populate RHS
   bysort id v1copy: generate v2copy = _n
   forvalues i=1/`n2' {
      quietly bysort id v1copy: replace RHS = v2_`i' if _n==`i'

      * Each v2 has (n2-1) instruments, this loop assigns each permutation the correct IVs
      quietly generate IV`i' = .
      local iminus1 =`i' - 1
      capture replace IV`i' = v2_`i' if `i' < v2copy 
      capture replace IV`iminus1' = v2_`i' if `i' > v2copy

   }
   drop IV`n2' // The loop creates one extra, unused IV variable

   * Generate LHS,RHS permutation-specific intercepts
   bysort id (v1copy v2copy): generate copy = _n 
   quietly tab copy,gen(perm_int)

   **** (2) Estimation

   ** (2.1) Compute full sample corrected correlation
   if "`one_and_one'" != "true" { 
      ** In all cases but the 1x1 case, get the IV estimator
      quietly ivreg LHS (RHS = IV*) perm_int*  `pw_exp', nocons cluster(id)
      local correctedCoefficient = _b[RHS]
      local asymp_se = _se[RHS]
   }

   *** ONE AND MANY CASE
   *********************
   if "`one_and_many'" == "true" {
      display "Warning: in the 1xK case consistency is by assumption, and not guaranteed" 

      ** Step 1: Calculate RHS sd
      * There are two ways to calculate correlation in this case; see program option 'equalvar'
      if "`equalvar'" == "equalvar" {
         * In this case, take the sd of the RHS variables over the stacked individuals obs
         display "Equal variance assumed"
         quietly summarize RHS `aw_exp'
         local correctedRHSVar = r(Var)
      }
      else {
         * In this case, average over the pairwise RHS correlations
         display "Equal variance not assumed (default)"
         local correctedRHSVar = 0
         scalar nperms_rhs = 0
         forvalues i=1/`n2' {
            forvalues j=1/`i' {
               if `i' != `j' {
                  quietly correlate  v2_`i' v2_`j' if v1copy==1 & v2copy==1  `aw_exp', cov
                  local correctedRHSVar = `correctedRHSVar'+r(cov_12)
                  scalar nperms_rhs = nperms_rhs + 1
               }
            }
         }
         local correctedRHSVar = `correctedRHSVar'/scalar(nperms_rhs)
      }

      ** Step 2: Calculate LHS sd
      quietly summarize v1_1 if v1copy==1 & v2copy==1 `aw_exp' 
      local correctedLHSVar = r(Var)

      local fs_corrected_corr =`correctedCoefficient'*sqrt(`correctedRHSVar'/`correctedLHSVar')
      local asymp_se = `asymp_se'*sqrt(`correctedRHSVar'/`correctedLHSVar')
   }

   *** MANY AND MANY CASE
   *********************
   if "`many_and_many'" == "true" {
      ** Many-and-many case: use pairwise correlations for both variable groups

      ** Get variance of RHS 
      local correctedRHSVar = 0
      scalar nperms_rhs = 0
      forvalues i=1/`n2' {
         forvalues j=1/`i' {
            if `i' != `j' {
               quietly correlate  v2_`i' v2_`j' if v1copy==1 & v2copy==1  `aw_exp', cov
               local correctedRHSVar = `correctedRHSVar'+r(cov_12)
               scalar nperms_rhs = nperms_rhs + 1
            }
         }
      }
      local correctedRHSVar = `correctedRHSVar'/scalar(nperms_rhs)

      ** Get variance of LHS 
      local correctedLHSVar = 0
      scalar nperms_lhs = 0
      forvalues i=1/`n1' {
         forvalues j=1/`i' {
            if `i' != `j' {
               quietly correlate v1_`i' v1_`j' if v1copy==1 & v2copy==1  `aw_exp', cov
               local correctedLHSVar = `correctedLHSVar'+r(cov_12)
               scalar nperms_lhs = nperms_lhs + 1
            }
         }
      }
      local correctedLHSVar = `correctedLHSVar'/scalar(nperms_lhs)
      
      local fs_corrected_corr =`correctedCoefficient'*sqrt(`correctedRHSVar'/`correctedLHSVar')
      local asymp_se = `asymp_se'*sqrt(`correctedRHSVar'/`correctedLHSVar')
   }

   *** ONE AND ONE CASE
   *********************
   if "`one_and_one'"=="true" {
      quietly reg v1_1 v2_1 `pw_exp'
      local fs_corrected_b = _b[v2_1]
      local fs_corrected_se = _se[v2_1]
      quietly summarize v1_1 `aw_exp'
      local fs_correctedLHSsd = r(sd)
      quietly summarize v2_1 `aw_exp'
      local fs_corrected_corr = `fs_corrected_b'*r(sd)/`fs_correctedLHSsd'
      local asymp_se = `fs_corrected_se'*r(sd)/`fs_correctedLHSsd'
   }

   *********************
   *** (3) BOOTSTRAP
   *********************

   if `bsreps'>0 { 
      tempfile bs_results
      tempname bs_store_data
      quietly postfile `bs_store_data' bs_rho using `bs_results',replace

      *** ONE AND ONE CASE
      *********************
      if "`one_and_one'"=="true" {
         forvalues bb = 1/`bsreps'  {
            preserve
               bsample
                     quietly correlate v1_1 v2_1 `aw_exp' 
                     scalar bs_correctedRho = r(rho)
                     post `bs_store_data' (scalar(bs_correctedRho))
           restore
         }
      }
               
      *** ONE AND MANY CASE
      *********************
      if "`one_and_many'"=="true" {

         *** Split into the 2x1 case and Kx1 case (K>2)

         *** 2x1 CASE:
         if `n2' == 2 {
               forvalues bb = 1/`bsreps'  {
                  preserve
                     bsample, cluster(id)
                        quietly ivreg LHS (RHS = IV*) perm_int*  `pw_exp', nocons 
                        local bs_correctedCoefficient = _b[RHS]

                        * There are two ways to calculate correlation in this case; see program option 'equalvar'
                        if "`equalvar'" == "equalvar" {
                           * In equalvar case, take the sd of the RHS variables over the stacked individuals obs
                           quietly summarize RHS `aw_exp' 
                           local bs_correctedRHSVar = r(Var)

                           ** Step 2: Calculate LHS sd
                           quietly summarize v1_1 if v1copy==1 & v2copy==1 `aw_exp' 
                           local bs_correctedLHSVar = r(Var)

                           * Calculate the corrected correlation
                           scalar bs_correctedRho = `bs_correctedCoefficient'*sqrt(`bs_correctedRHSVar'/`bs_correctedLHSVar')
                           post `bs_store_data' (scalar(bs_correctedRho))
                        }

                        else {
                           * In default case, average over the pairwise RHS correlations
                           quietly keep if v1copy==1 & v2copy==1
                           quietly correlate v2_1 v2_1 `aw_exp', cov
                           local bs_correctedRHSVar = r(cov_12)

                           ** Step 2: Calculate LHS sd
                           quietly summarize v1_1 `aw_exp' 
                           local bs_correctedLHSVar = r(Var)

                           * Calculate the corrected correlation
                           scalar bs_correctedRho = `bs_correctedCoefficient'*sqrt(`bs_correctedRHSVar'/`bs_correctedLHSVar')
                           post `bs_store_data' (scalar(bs_correctedRho))
                        }

                  restore
               }
         }
         *** Kx1 CASE:
         else {
            forvalues bb = 1/`bsreps'  {
               preserve
                  bsample, cluster(id)
                     quietly ivreg LHS (RHS = IV*) perm_int*  `pw_exp', nocons
                     local bs_correctedCoefficient = _b[RHS]

                     ** Step 1: Calculate RHS sd
                     * There are two ways to calculate correlation in this case; see program option 'equalvar'
                     if "`equalvar'" == "equalvar" {
                        * In equalvar case, take the sd of the RHS variables over the stacked individuals obs
                        quietly summarize RHS `aw_exp' 
                        local bs_correctedRHSVar = r(Var)
                        quietly keep if v1copy==1 & v2copy==1
                     }
                     else {
                        * In default case, average over the pairwise RHS correlations
                        quietly keep if v1copy==1 & v2copy==1
                        local bs_correctedRHSVar = 0
                        forvalues i=1/`n2' {
                           forvalues j=1/`i' {
                              if `i' != `j' {
                                 quietly correlate v2_`i' v2_`j' `aw_exp' , cov
                                 local bs_correctedRHSVar = `correctedRHSVar'+r(cov_12)
                              }
                           }
                        }
                        local bs_correctedRHSVar = `bs_correctedRHSVar'/scalar(nperms_rhs)
                     }

                     ** Step 2: Calculate LHS variance
                     quietly summarize v1_1 `aw_exp' 
                     local bs_correctedLHSVar = r(Var)

                     * Calculate the corrected correlation
                     scalar bs_correctedRho = `bs_correctedCoefficient'*sqrt(`bs_correctedRHSVar'/`bs_correctedLHSVar')
                     post `bs_store_data' (scalar(bs_correctedRho))
               restore
            }
         }
      }

      *** MANY AND MANY CASE
      *********************
      if "`many_and_many'"=="true" {

         *** Split into 2x2 case and MxK case (M,K > 2)

         ** 2x2 CASE
         if `n1'==2 & `n2'==2 {
            forvalues bb = 1/`bsreps'  {
               preserve
                  bsample, cluster(id)
                     quietly ivreg LHS (RHS = IV*) perm_int*  `pw_exp', nocons
                     local bs_correctedCoefficient = _b[RHS]

                     quietly keep if v1copy==1 & v2copy==1
                     * Calculate average of covariances for RHS variables
                     quietly correlate  v2_1 v2_2  `aw_exp', cov
                     local bs_correctedRHSVar = r(cov_12)

                     * Calculate average of covariances for LHS variables
                     quietly correlate v1_1 v1_2  `aw_exp', cov
                     local bs_correctedLHSVar = r(cov_12)

                     * Calculate the corrected correlation
                     scalar bs_correctedRho = `bs_correctedCoefficient'*sqrt(`bs_correctedRHSVar'/`bs_correctedLHSVar')
                     post `bs_store_data' (scalar(bs_correctedRho))
               restore
            }

         }

         ** MxK CASE
         else {
            forvalues bb = 1/`bsreps'  {
               preserve
                  bsample, cluster(id)
                     quietly ivreg LHS (RHS = IV*) perm_int*  `pw_exp', nocons
                     local bs_correctedCoefficient = _b[RHS]
                     quietly keep if v1copy==1 & v2copy==1 

                     * Calculate average of pairwise covariances for RHS variables
                     local bs_correctedRHSVar = 0
                     forvalues i=1/`n2' {
                        forvalues j=1/`i' {
                           if `i' != `j' {
                              quietly correlate  v2_`i' v2_`j'  `aw_exp', cov
                              local bs_correctedRHSVar = `bs_correctedRHSVar'+r(cov_12)
                           }
                        }
                     }
                     local bs_correctedRHSVar = `bs_correctedRHSVar'/scalar(nperms_rhs)

                     * Calculate average of pairwise covariances for LHS variables
                     local bs_correctedLHSVar = 0
                     forvalues i=1/`n1' {
                        forvalues j=1/`i' {
                           if `i' != `j' {
                              quietly correlate v1_`i' v1_`j'  `aw_exp', cov
                              local bs_correctedLHSVar = `bs_correctedLHSVar'+r(cov_12)
                           }
                        }
                     }
                     local bs_correctedLHSVar = `bs_correctedLHSVar'/scalar(nperms_lhs)

                     * Calculate the corrected correlation
                     scalar bs_correctedRho = `bs_correctedCoefficient'*sqrt(`bs_correctedRHSVar'/`bs_correctedLHSVar')
                     post `bs_store_data' (scalar(bs_correctedRho))
               restore
            }
         }
      }

      postclose `bs_store_data'
   }


   *********************
   *** (4) CALCULATE TSTAT,PVALUE
   *********************
   quietly summarize id
   local N = r(max)

   if `bsreps'>0 {
      use `bs_results',clear
      quietly summarize bs_rho
      local se = r(sd)
   }
   else {
      local se = `asymp_se'
   }

   local tstat = `fs_corrected_corr'/`se'
   local pval = ttail(`N',`tstat') // get size of sample; not clear what N to use though

   *********************
   *** (5) REPORT & RETURN
   *********************
   * Return the estimated correlation
   return scalar corr = `fs_corrected_corr'

   * Return the se
   return scalar se = `se'

   * Return pvalue,tstat
   return scalar tstat = `tstat'
   return scalar pval = `pval'

   * Display results
   display "Estimated correlation is: " string(`fs_corrected_corr',"%9.8f")
   if `bsreps'>0 {
      display "   with boostrap s.e.   : " string(`se',"%9.8f")
   }
   else {
      display "   with asymptotic s.e  : " string(`se',"%9.8f")
      display "   (Note: not valid for testing hypotheses other than rho = 0)"
   }

   * Restore data to original shape
   use `tempo',clear

end

